import pandas as pd
import re

# Load novels.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer='novels.csv', encoding='utf-8')

# 1. Clean the title column - strip 【】 tags together with their contents
def clean_title(title):
    """Return *title* without 【...】 tags, square brackets or extra spaces.

    Args:
        title: Raw cell value from the title column. Values that
            ``pd.isna`` reports as missing are returned unchanged.
            Non-string values (e.g. a title such as 1984 that was parsed
            as a number) are converted with ``str`` before cleaning.

    Returns:
        The cleaned title string, or the original value when it is missing.
    """
    if pd.isna(title):
        return title
    # Coerce first so a numeric-looking title does not crash ``re.sub``.
    title = str(title)
    # Non-greedy match so each 【...】 tag is removed on its own instead of
    # swallowing the text between two separate tags.
    title = re.sub(r'【.*?】', '', title)
    # Drop stray ASCII square brackets but keep the text inside them.
    title = re.sub(r'[\[\]]', '', title)
    # Removing a tag from the middle of a title leaves a doubled space;
    # collapse any run of two or more whitespace characters to one space.
    title = re.sub(r'\s{2,}', ' ', title)
    return title.strip()

# Run every title cell through clean_title and store the result in place
df['书名'] = df['书名'].map(clean_title)

# 2. Clean the author column
def clean_author(author):
    """Return the first listed author with square brackets removed.

    Args:
        author: Raw cell value from the author column. Non-string values
            are converted with ``str`` before cleaning.

    Returns:
        The first author name, stripped of surrounding whitespace. The
        placeholder "未知" is returned when the value is missing
        (per ``pd.isna``) or when nothing is left after cleaning.
    """
    if pd.isna(author):
        return "未知"
    # Remove ASCII square brackets but keep the text inside them.
    author = re.sub(r'[\[\]]', '', str(author))
    # Multiple authors may be separated by the enumeration comma, an ASCII
    # comma or a full-width comma; keep only what precedes the first one.
    first = re.split(r'[、,，]', author, maxsplit=1)[0].strip()
    # An empty result carries no more information than a missing value,
    # so report it with the same placeholder.
    return first or "未知"

# Fill missing authors with the placeholder, then clean each cell
df['作者'] = (
    df['作者']
    .fillna('未知')
    .map(clean_author)
)

# 3. Clean the price column
def clean_price(price):
    """Extract a numeric price from a raw price cell.

    Args:
        price: Raw cell value from the price column; it is converted with
            ``str`` before parsing, so numbers and strings are accepted.

    Returns:
        The price as a ``float``. When the text contains the promo label
        "促销价", the first number after the last such label is preferred;
        otherwise (or when no number follows the label) the first number
        in the text is used. Returns 0.0 for missing values (per
        ``pd.isna``) and for text that contains no number.
    """
    if pd.isna(price):
        return 0.0
    # Drop thousands separators ("1,299.00" -> "1299.00") so the whole
    # amount is read rather than only the leading digit group.
    text = re.sub(r'(?<=\d),(?=\d{3}(?!\d))', '', str(price))
    # rpartition returns the whole text as its last item when the label is
    # absent, so this works for both promo and plain prices and does not
    # depend on which kind of colon follows the label.
    after_label = text.rpartition('促销价')[2]
    for candidate in (after_label, text):
        # Match one well-formed decimal number; a looser character class
        # could capture "12.50." or "...", which float() rejects.
        match = re.search(r'\d+(?:\.\d+)?|\.\d+', candidate)
        if match:
            return float(match.group())
    return 0.0

# Convert every price cell to a float via clean_price
df['价格'] = df['价格'].map(clean_price)

# 4. Check for and handle duplicate rows
# Report how many rows are exact duplicates of an earlier row, then drop
# them (keeping the first occurrence) so they are actually handled rather
# than only counted.
print("重复值数量:", df.duplicated().sum())
df = df.drop_duplicates()

# 5. Save the cleaned data
# The utf-8-sig codec prepends a byte-order mark to the output file;
# index=False leaves the DataFrame index out of the CSV.
df.to_csv(
    path_or_buf='cleaned_novels.csv',
    encoding='utf-8-sig',
    index=False,
)

print("数据清洗完成，已保存为 cleaned_novels.csv")